import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.image as mpimg
1 - age (numeric)
2 - job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')
3 - marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)
4 - education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')
5 - default: has credit in default? (categorical: 'no','yes','unknown')
6 - housing: has housing loan? (categorical: 'no','yes','unknown')
7 - loan: has personal loan? (categorical: 'no','yes','unknown')
# related to the last contact of the current campaign:
8 - contact: contact communication type (categorical: 'cellular','telephone')
9 - month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
10 - day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')
11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
# other attributes:
12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
14 - previous: number of contacts performed before this campaign and for this client (numeric)
15 - poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')
# social and economic context attributes
16 - emp.var.rate: employment variation rate - quarterly indicator (numeric)
17 - cons.price.idx: consumer price index - monthly indicator (numeric)
18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)
19 - euribor3m: euribor 3 month rate - daily indicator (numeric)
20 - nr.employed: number of employees - quarterly indicator (numeric)
21 - y - has the client subscribed a term deposit? (binary: 'yes','no')
# Load the bank marketing data set.
df = pd.read_csv("../DATA/bank-full.csv")

# Based on domain experience, for marketing purposes we may want to cluster
# clients into 3 groups: took a loan, didn't take a loan, unknown.
# Age distribution, split by the 'loan' column.
plt.figure(dpi=200, figsize=(12, 6))
sns.histplot(x='age', hue='loan', data=df)
<AxesSubplot:xlabel='age', ylabel='Count'>
# pdays: days elapsed since the client was last contacted in a previous
# campaign (numeric; 999 is a sentinel meaning "not previously contacted").
# Exclude the sentinel rows from the histogram.
previously_contacted = df.loc[df['pdays'] != 999]
plt.figure(dpi=200, figsize=(12, 6))
sns.histplot(x='pdays', data=previously_contacted)
<AxesSubplot:xlabel='pdays', ylabel='Count'>
# Call duration is in seconds, so 1000 on the x-axis is roughly 16 minutes.
plt.figure(dpi=200, figsize=(12, 6))
sns.histplot(x='duration', hue='contact', data=df)
<AxesSubplot:xlabel='duration', ylabel='Count'>
# Number of clients per education level, most frequent level first.
education_order = df['education'].value_counts().index
plt.figure(dpi=200, figsize=(12, 6))
sns.countplot(x='education', order=education_order, data=df)
plt.xticks(rotation=90);
We can't feed categorical data directly to K-means: it is a distance-based algorithm -> encode the categorical columns as dummy variables.
We also need to scale the data: because K-means is distance-based, features with larger ranges would otherwise dominate the distances -> scale the features.
from sklearn.preprocessing import StandardScaler

# One-hot encode the categorical columns so every feature is numeric.
X = pd.get_dummies(df)

# Standardise the features. Fitting the scaler on all rows is not data
# leakage here because there is no label involved.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
from sklearn.cluster import KMeans

# Fix the seed so the clustering (and which cluster is called 0 vs 1) is the
# same on every run; without it, re-running can swap the two label values.
model = KMeans(n_clusters=2, random_state=42)

# fit_predict does two things in one call:
# 1- fits on all features to find the cluster centers
# 2- assigns every row to its closest cluster
# The label values themselves are arbitrary identifiers; what matters is
# which rows end up sharing a cluster.
cluster_labels = model.fit_predict(X)
cluster_labels
array([1, 1, 1, ..., 0, 0, 0])
# Rebuild the one-hot encoded (unscaled) frame and attach each row's cluster.
df = pd.get_dummies(df)
df['Cluster'] = cluster_labels

# Correlation of every column with the cluster label. 'Cluster' is the last
# column, so iloc[:-1] drops its self-correlation before plotting.
cluster_corr = df.corr()['Cluster']
plt.figure(dpi=200, figsize=(12, 6))
cluster_corr.iloc[:-1].sort_values().plot(kind='bar')
<AxesSubplot:>
# Elbow method: fit K-means for k = 2..9 and record each model's inertia.
ssd = []
for k in range(2, 10):
    # Fixed seed so the curve is reproducible between runs.
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(X)
    # inertia_: sum of squared distances of samples to their closest
    # cluster center.
    ssd.append(model.inertia_)
ssd
[2469792.3616627543, 2370786.446603645, 2271502.8081971155, 2228290.0533834356, 2157695.015264023, 2074338.1385483479, 2076251.5749846818, 1995548.640403869]
# Elbow curve: sum of squared distances against the number of clusters.
k_values = range(2, 10)
plt.plot(k_values, ssd, 'o--')
plt.xlabel("K Value")
plt.ylabel(" Sum of Squared Distances")
Text(0, 0.5, ' Sum of Squared Distances')
# Change in SSD relative to the previous K value (the first entry is NaN).
ssd_series = pd.Series(ssd)
ssd_series.diff()
0 NaN 1 -99005.915059 2 -99283.638407 3 -43212.754814 4 -70595.038119 5 -83356.876716 6 1913.436436 7 -80702.934581 dtype: float64
# Read the JPEG file into an array of pixels.
image_as_array = mpimg.imread("Hadeel's_me_time.jpg")
# Each innermost triple is one pixel's (R, G, B) values.
image_as_array
array([[[174, 153, 168], [175, 154, 169], [175, 154, 169], ..., [ 95, 96, 101], [ 96, 96, 104], [ 96, 96, 104]], [[175, 157, 171], [175, 157, 171], [176, 158, 172], ..., [ 95, 96, 101], [ 95, 95, 103], [ 95, 95, 103]], [[178, 160, 174], [179, 161, 175], [180, 162, 176], ..., [ 96, 97, 102], [ 95, 95, 103], [ 95, 95, 103]], ..., [[172, 127, 150], [172, 127, 150], [172, 127, 150], ..., [201, 225, 225], [201, 225, 225], [201, 225, 225]], [[172, 127, 150], [172, 127, 150], [172, 127, 150], ..., [202, 226, 226], [200, 224, 224], [200, 224, 224]], [[172, 127, 150], [171, 126, 149], [171, 126, 149], ..., [202, 226, 226], [199, 223, 223], [199, 223, 223]]], dtype=uint8)
# Display the original image.
plt.figure(dpi=100, figsize=(6, 6))
plt.imshow(image_as_array)
<matplotlib.image.AxesImage at 0x2547155cac8>
# The image is a 3D array laid out as (height, width, channels);
# for this image that is height 1280, width 1162, 3 channels.
# First dimension (height - h)   -> number of rows of pixels.
# Second dimension (width - w)   -> number of columns of pixels.
# Third dimension (channels - c) -> colour information (3 for RGB images).
h, w, rgb = image_as_array.shape
h, w, rgb
(1280, 1162, 3)
# Flatten to a 2D array of shape (height * width, channels).
# Rows    -> one per pixel of the original image.
# Columns -> the R, G, B values of that pixel.
images_as_2d_array = image_as_array.reshape((h * w, rgb))
images_as_2d_array.shape
(1487360, 3)
K-means is designed to train on 2D data (rows of samples and columns of features), so we reshape the image array above from (h, w, c) ---> (h * w, c).
from sklearn.cluster import KMeans

# Cluster the pixels into 20 representative colours. The seed is fixed so
# the resulting palette is the same on every run.
model = KMeans(n_clusters=20, random_state=42)
label = model.fit_predict(images_as_2d_array)
# The 20 cluster centers, one RGB colour per row.
model.cluster_centers_
array([[179.08497623, 156.92511884, 144.75479109], [219.02596897, 221.15505002, 214.22445037], [104.81497285, 91.95223318, 83.54635376], [189.74249677, 172.71689254, 165.89847001], [181.18202613, 216.36212161, 228.68702586], [227.59565859, 220.00657499, 160.73084914], [167.62981414, 122.98687488, 145.0799323 ], [206.66526934, 207.06678184, 199.68490644], [217.50141054, 208.61443746, 148.80899082], [208.69399195, 153.46626657, 103.00016488], [ 39.04528227, 37.31329971, 36.27710809], [136.46617339, 111.22182378, 107.30387131], [132.0054727 , 153.92413463, 167.36051443], [185.50186031, 145.66167307, 164.82556187], [159.0311138 , 195.61132216, 213.05045797], [194.92928672, 229.25751486, 239.88826036], [159.31620303, 139.733546 , 122.44314397], [ 72.52850016, 69.40758237, 67.08604276], [194.91417204, 193.64641021, 189.2989296 ], [180.01698129, 191.71842945, 138.31592104]])
# Cluster index assigned to each pixel row of images_as_2d_array.
label
array([13, 13, 13, ..., 15, 15, 15])
# Round each cluster center to integer RGB codes, then give every pixel the
# colour of its cluster and restore the (h, w, rgb) image layout.
rgb_codes = model.cluster_centers_.round(0).astype(int)
quantized_image = rgb_codes[label].reshape(h, w, rgb)
plt.figure(dpi=100, figsize=(6, 6))
plt.imshow(quantized_image)
<matplotlib.image.AxesImage at 0x254715d9748>